library(tinytex)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Load the Netflix titles catalogue from the working directory.
# By default read.csv() keeps blank cells in character columns as "" rather
# than NA; mapping "" to NA here lets the NA-based filters used later in this
# analysis (drop_na(), is.na()) actually catch missing values.
my.data <- read.csv("./netflix_titles.csv", na.strings = c("", "NA"))
# Print the loaded data for a quick visual check.
my.data

First Data Visualization Technique – Pie Chart

# Number of titles per type, plus each type's share of all rows in my.data.
data <- my.data %>%
  group_by(type) %>%
  summarise(
    counts = n(),
    percentage = n() / nrow(my.data)
  )

# Pie chart: a single stacked bar bent into a circle with polar coordinates.
# Each slice is labelled with its rounded percentage, centred in the slice.
ggplot(data, aes(x = "", y = percentage, fill = type)) +
  geom_bar(stat = "identity", width = 1) +
  geom_text(
    aes(label = paste0(round(percentage * 100), "%")),
    position = position_stack(vjust = 0.5),
    size = 8
  ) +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "# of Movies vs TV Shows on Netflix ",
    fill = "Type",
    x = NULL,
    y = NULL
  ) +
  theme_void() +
  # theme_void() resets the theme, so the title tweak has to come after it.
  theme(plot.title = element_text(hjust = 0.5, size = 22))

We can see that 70% of listings on Netflix are Movies and only 30% are TV Shows, which makes sense because every movie gets its own title, whereas a TV show has many episodes and seasons under the same title.

Second Data Visualization Technique – Histogram

# Movie runtimes in minutes.
# `duration` is text (presumably of the form "90 min" -- confirm against the
# CSV), so the number is extracted with parse_number(). Rows are dropped
# AFTER parsing: a blank or unparsable duration only becomes NA once parsed,
# and dropping on `mins` keeps those rows out of the histogram instead of
# leaving plotly to discard them with an "Ignoring n observations" warning.
movies <- my.data %>%
  select(type, duration) %>%
  filter(type == "Movie") %>%
  mutate(mins = parse_number(duration)) %>%
  drop_na(mins)

# Interactive histogram of movie runtimes (at most 40 bins on the x axis).
movies %>%
  plot_ly(
    x = ~mins,
    type = "histogram",
    nbinsx = 40,
    marker = list(
      color = "red",
      # Thin black outline so adjacent bars are distinguishable.
      line = list(color = "black", width = 1)
    )
  ) %>%
  layout(
    title = "Movie Durations",
    yaxis = list(title = "Number of Movies", zeroline = FALSE),
    xaxis = list(title = "Number of Minutes", zeroline = FALSE)
  )
## Warning: Ignoring 3 observations

We can see that the highest frequency of movies is about 90-99 mins long. The histogram is also left-skewed.

Third Data Visualization Technique – Bar Graph

# Grouped bar chart: number of titles per rating, split by type.
# x carries the type (Movie / TV Show), y the count, and colour the rating.
my.data %>%
  select(rating, type) %>%
  # Drop titles without a rating. The explicit "" check covers blank CSV cells,
  # which read.csv() keeps as empty strings rather than NA unless told
  # otherwise.
  filter(!is.na(rating), rating != "") %>%
  # Keep the 5 most frequent ratings; everything else is lumped into "Other".
  mutate(rating = fct_lump_n(rating, n = 5)) %>%
  group_by(rating, type) %>%
  # .groups = "drop" returns an ungrouped result and silences the
  # "grouped output" message from summarise().
  summarise(freq = n(), .groups = "drop") %>%
  arrange(freq) %>%
  plot_ly(
    x = ~type,
    y = ~freq,
    type = "bar",
    color = ~rating,
    # Print each bar's count just above it.
    text = ~freq,
    textposition = "outside",
    textfont = list(color = "black", size = 12)
  ) %>%
  layout(
    title = "# of Movies & TV Shows According to Their Ratings",
    # Axis titles follow the mappings above: type on x, count on y.
    xaxis = list(title = "Type and Their Ratings"),
    yaxis = list(title = "Frequency"),
    legend = list(title = list(text = "Rating"))
  )
## `summarise()` has grouped output by 'rating'. You can override using the
## `.groups` argument.

We can see that TV-MA is the most popular rating for both Movies and TV Shows. The least popular is PG-13 for Movies and R for TV Shows, which makes sense because R-rated content is normally not allowed on television.